{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 作业四：\n",
    "在https://www.kaggle.com/c/titanic Kaggle网站上下载和处理泰坦尼克数据集 15分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#数据读取\n",
    "data_s=pd.read_csv(\"train.csv\")\n",
    "data_s.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 文件headers解释\n",
    "- PassengerId:乘客ID\n",
    "- Survived:是否存活\n",
    "- Pclass:仓位等级\n",
    "- Name:姓名\n",
    "- Sex:性别\n",
    "- Age:年龄\n",
    "- SibSp:(sibling+spouse):兄弟姐妹与配偶\n",
    "- Parch:(parents+children):父母与子女\n",
    "- Ticket:票号\n",
    "- Fare:票价\n",
    "- Cabin:仓位\n",
    "- Embarked:登船港口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    891 non-null int64\n",
      "Survived       891 non-null int64\n",
      "Pclass         891 non-null int64\n",
      "Name           891 non-null object\n",
      "Sex            891 non-null object\n",
      "Age            714 non-null float64\n",
      "SibSp          891 non-null int64\n",
      "Parch          891 non-null int64\n",
      "Ticket         891 non-null object\n",
      "Fare           891 non-null float64\n",
      "Cabin          204 non-null object\n",
      "Embarked       889 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 文件信息\n",
    "- 总共有891行信息\n",
    "    - 其中Age714行信息，有缺失，后面的处理方法待定\n",
    "    - Cabin：乘客所在船舱，后续根据pclass，fare是否能补充\n",
    "    - Embarked：登船港口缺失两个值，考虑是否忽略数据缺失\n",
    "    - 其他数据不缺失值\n",
    "    - Name,Sex,Ticket,Cabin,Embarked都是object值\n",
    "        - 后续Sex可以转换为独热编码\n",
    "        - PassageId，Name，Ticket都是用来索引数据的，可以不考虑"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    549\n",
       "1    342\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Survived\"].value_counts()\n",
    "#549人遇害，342人存活"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    491\n",
       "1    216\n",
       "2    184\n",
       "Name: Pclass, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Pclass\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    136\n",
       "0     80\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Pclass\"]==1][\"Survived\"].value_counts()\n",
    "#1级仓存活136人，遇害80人"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    97\n",
       "1    87\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Pclass\"]==2][\"Survived\"].value_counts()\n",
    "#2级仓存活87人，遇害97人"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    372\n",
       "1    119\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Pclass\"]==3][\"Survived\"].value_counts()\n",
    "#3级仓存活119人，遇害372人"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "80.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Age\"].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.42"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Age\"].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>803</td>\n",
       "      <td>804</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Thomas, Master. Assad Alexander</td>\n",
       "      <td>male</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2625</td>\n",
       "      <td>8.5167</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                             Name   Sex  \\\n",
       "803          804         1       3  Thomas, Master. Assad Alexander  male   \n",
       "\n",
       "      Age  SibSp  Parch Ticket    Fare Cabin Embarked  \n",
       "803  0.42      0      1   2625  8.5167   NaN        C  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Age\"]==0.42]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>630</td>\n",
       "      <td>631</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Barkworth, Mr. Algernon Henry Wilson</td>\n",
       "      <td>male</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>27042</td>\n",
       "      <td>30.0</td>\n",
       "      <td>A23</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                  Name  \\\n",
       "630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   \n",
       "\n",
       "      Sex   Age  SibSp  Parch Ticket  Fare Cabin Embarked  \n",
       "630  male  80.0      0      0  27042  30.0   A23        S  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Age\"]==80]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    608\n",
       "1    209\n",
       "2     28\n",
       "4     18\n",
       "3     16\n",
       "8      7\n",
       "5      5\n",
       "Name: SibSp, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#兄弟姐妹\n",
    "data_s[\"SibSp\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>159</td>\n",
       "      <td>160</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Master. Thomas Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>180</td>\n",
       "      <td>181</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Miss. Constance Gladys</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>201</td>\n",
       "      <td>202</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Mr. Frederick</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>324</td>\n",
       "      <td>325</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Mr. George John Jr</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>792</td>\n",
       "      <td>793</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Miss. Stella Anna</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>846</td>\n",
       "      <td>847</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Mr. Douglas Bullen</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>863</td>\n",
       "      <td>864</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Miss. Dorothy Edith \"Dolly\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                               Name     Sex  \\\n",
       "159          160         0       3         Sage, Master. Thomas Henry    male   \n",
       "180          181         0       3       Sage, Miss. Constance Gladys  female   \n",
       "201          202         0       3                Sage, Mr. Frederick    male   \n",
       "324          325         0       3           Sage, Mr. George John Jr    male   \n",
       "792          793         0       3            Sage, Miss. Stella Anna  female   \n",
       "846          847         0       3           Sage, Mr. Douglas Bullen    male   \n",
       "863          864         0       3  Sage, Miss. Dorothy Edith \"Dolly\"  female   \n",
       "\n",
       "     Age  SibSp  Parch    Ticket   Fare Cabin Embarked  \n",
       "159  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "180  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "201  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "324  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "792  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "846  NaN      8      2  CA. 2343  69.55   NaN        S  \n",
       "863  NaN      8      2  CA. 2343  69.55   NaN        S  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"SibSp\"]==8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    678\n",
       "1    118\n",
       "2     80\n",
       "5      5\n",
       "3      5\n",
       "4      4\n",
       "6      1\n",
       "Name: Parch, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#父母与子女\n",
    "data_s[\"Parch\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    65\n",
       "0    53\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==1][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    40\n",
       "0    40\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==2][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    3\n",
       "0    2\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==3][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    4\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==4][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    4\n",
       "1    1\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==5][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==6][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    445\n",
       "1    233\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Parch\"]==0][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 家庭成员的数据解释\n",
    "- 家里有一个成员的，活下来了65个，没活下来的有53个\n",
    "- 家里有两个成员的，活下来了40个，没活下来的有40个\n",
    "- 家里有三个成员的，活下来了3 个，没活下来的有2 个\n",
    "- 家里有四个成员的，活下来了  个，没活下来的有4 个\n",
    "- 家里有五个成员的，活下来了1 个，没活下来的有4 个\n",
    "- 家里有六个成员的，活下来了  个，他们都没有活下来，是Sage一家\n",
    "- 自己一个人登船的，活下来了223个，失去生命的445个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    233\n",
       "0     81\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Sex\"]==\"female\"][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    468\n",
       "1    109\n",
       "Name: Survived, dtype: int64"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Sex\"]==\"male\"][\"Survived\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 总共有891人，其中女性有314，活下来的有233，存活比例74.2%\n",
    "# 总共有891人，其中男性有577，活下来的有109，存活比例18.9%"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# index顺序打乱\n",
    "数据集去掉是否生存字段，并且将该字段单独取出来作为标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Pclass                                               Name  \\\n",
       "0            1       3                            Braund, Mr. Owen Harris   \n",
       "1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "2            3       3                             Heikkinen, Miss. Laina   \n",
       "3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "4            5       3                           Allen, Mr. William Henry   \n",
       "\n",
       "      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  \n",
       "0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  \n",
       "1  female  38.0      1      0          PC 17599  71.2833   C85        C  \n",
       "2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3  female  35.0      1      0            113803  53.1000  C123        S  \n",
       "4    male  35.0      0      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shuffle_index=np.random.permutation(891)\n",
    "X=data_s.drop(\"Survived\",axis=1)\n",
    "y=data_s[\"Survived\"]\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      0\n",
       "1      1\n",
       "2      1\n",
       "3      1\n",
       "4      0\n",
       "      ..\n",
       "886    0\n",
       "887    1\n",
       "888    0\n",
       "889    1\n",
       "890    0\n",
       "Name: Survived, Length: 891, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 各舱位票价：英镑\n",
    "- 头等舱：84.2\n",
    "- 二等舱：20.7\n",
    "- 三等舱：13.7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13.675550101832993"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Pclass\"]==3][\"Fare\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20.662183152173913"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Pclass\"]==2][\"Fare\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "84.1546875"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Pclass\"]==1][\"Fare\"].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Attention!!!\n",
    "\n",
    "- 对于任何一份数据都应该客观冷静处理，首先其是一份数据，先按照数据的处理流程去做，因为现在还是初学：学习阶段，不应该乱七八糟的理论掺杂进来，先做事\n",
    "- 下一个阶段是把这份数据处理好，这个是下个阶段，因为行百里者半九十，你做到前面处理好了，不能就以为做完了，下个阶段要开始理顺整个逻辑，因为作为要接收你的几分钟的胡扯，你应该对你的接收者稍微负责一点，做的有思想一点，做好看是之后的事情\n",
    "- 如果你真有能力了，再谈对数据的理解吧，因为每个正常人对单个数据的理解都有其角度，你的不一定就高过别人，所以别老哔哔。反而是随机过程，如果里面真有知识，或者能佐证你想要的观点，确实值得挖掘，不过还是不要随意表达自己幼稚的观点好了。\n",
    "- 埋头做事，别人提意见就听听，然后继续埋头做你的事。\n",
    "\n",
    "- 看看泰坦尼克号就知道了。\n",
    "    - 海事规则：救生艇乘纳1/3人；Titanic超过一半；\n",
    "    - 救生艇能救1178；最后上去了651\n",
    "    - 救生艇能装65人，但是救生艇上面不超过65人\n",
    "    - 左侧妇女儿童上，男性不可以；右侧妇女儿童完了，男性可以上\n",
    "    - Titanic估计两小时就沉没了，实际接近2小时40分钟\n",
    "    - 救生艇半空中就往下直接放"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 开始做作业--流程\n",
    "- 数据分析\n",
    "    - 数据概览：done\n",
    "    - 测定基准：还没讲，我自己之后看视频慢慢补充进来\n",
    "    - 分析特征重要性：Sex(male+female)；Age(youth<16;adult(16<= <=60));the old(>60)；Pclass(123)；Family(SibSp+Parch)\n",
    "    - 数据展示（数据可视化）：我自己不喜欢可视化（但这个是学习过程，放下你的偏见）：暂定\n",
    "- 特征工程\n",
    "    - 数据清洗:done\n",
    "    - 提取新特征（Family（0：Alone+1：Family））:done\n",
    "    - 特征选取:done\n",
    "    - Final_data:done\n",
    "- 机器学习\n",
    "    - 算法选择\n",
    "    - 训练及评估\n",
    "    - 预测结果\n",
    "    - 人物画像\n",
    "- 解决疑问：\n",
    "    - 那么其他特征值分布比较离散的，对最终的存活率影响是怎样的呢？\n",
    "    - "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    891 non-null int64\n",
      "Survived       891 non-null int64\n",
      "Pclass         891 non-null int64\n",
      "Name           891 non-null object\n",
      "Sex            891 non-null object\n",
      "Age            714 non-null float64\n",
      "SibSp          891 non-null int64\n",
      "Parch          891 non-null int64\n",
      "Ticket         891 non-null object\n",
      "Fare           891 non-null float64\n",
      "Cabin          204 non-null object\n",
      "Embarked       889 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>61</td>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>829</td>\n",
       "      <td>830</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Stone, Mrs. George Nelson (Martha Evelyn)</td>\n",
       "      <td>female</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                       Name  \\\n",
       "61            62         1       1                        Icard, Miss. Amelie   \n",
       "829          830         1       1  Stone, Mrs. George Nelson (Martha Evelyn)   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket  Fare Cabin Embarked  \n",
       "61   female  38.0      0      0  113572  80.0   B28      NaN  \n",
       "829  female  62.0      0      0  113572  80.0   B28      NaN  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Embarked\"].isnull()]#该空值如何操作呢？"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>54</td>\n",
       "      <td>55</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Ostby, Mr. Engelhart Cornelius</td>\n",
       "      <td>male</td>\n",
       "      <td>65.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>113509</td>\n",
       "      <td>61.9792</td>\n",
       "      <td>B30</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>61</td>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0000</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>166</td>\n",
       "      <td>167</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Chibnall, Mrs. (Edith Martha Bowerman)</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>113505</td>\n",
       "      <td>55.0000</td>\n",
       "      <td>E33</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>252</td>\n",
       "      <td>253</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Stead, Mr. William Thomas</td>\n",
       "      <td>male</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113514</td>\n",
       "      <td>26.5500</td>\n",
       "      <td>C87</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>351</td>\n",
       "      <td>352</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Williams-Lambert, Mr. Fletcher Fellows</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113510</td>\n",
       "      <td>35.0000</td>\n",
       "      <td>C128</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                    Name  \\\n",
       "54            55         0       1          Ostby, Mr. Engelhart Cornelius   \n",
       "61            62         1       1                     Icard, Miss. Amelie   \n",
       "166          167         1       1  Chibnall, Mrs. (Edith Martha Bowerman)   \n",
       "252          253         0       1               Stead, Mr. William Thomas   \n",
       "351          352         0       1  Williams-Lambert, Mr. Fletcher Fellows   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket     Fare Cabin Embarked  \n",
       "54     male  65.0      0      1  113509  61.9792   B30        C  \n",
       "61   female  38.0      0      0  113572  80.0000   B28      NaN  \n",
       "166  female   NaN      0      1  113505  55.0000   E33        S  \n",
       "252    male  62.0      0      0  113514  26.5500   C87        S  \n",
       "351    male   NaN      0      0  113510  35.0000  C128        S  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Ticket\"].str.contains(r\"1135\")].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Embarked补充\n",
    "根据Cabine为B28附件的港口，S/C都有，所以暂时不能确定\n",
    "取Ticket前面有1135的票号的也存在C/S\n",
    "选离B28最近的B30的登录港口C"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s.loc[61,\"Embarked\"]=\"C\"\n",
    "data_s.loc[829,\"Embarked\"]=\"C\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]\n",
       "Index: []"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Embarked\"].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PassengerId                     62\n",
       "Survived                         1\n",
       "Pclass                           1\n",
       "Name           Icard, Miss. Amelie\n",
       "Sex                         female\n",
       "Age                             38\n",
       "SibSp                            0\n",
       "Parch                            0\n",
       "Ticket                      113572\n",
       "Fare                            80\n",
       "Cabin                          B28\n",
       "Embarked                         C\n",
       "Name: 61, dtype: object"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data_s[data_s[\"PassengerId\"]==61]\n",
    "data_s.loc[61]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cablin数据填充\n",
    "#按照 Ticket相同，cabin值一样进行补充"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Age数据补充\n",
    "youth<16;adult:16<= <=60;the old >60\n",
    "名字包含Master全部为男性，且年龄<12岁；之后归类为youth；目前按照年龄15填入\n",
    "名字包含Miss的跨度较大：0-63岁；所以不采用姓名填写；按照所有能确定年龄的占比填入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>65</td>\n",
       "      <td>66</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Moubarek, Master. Gerios</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2661</td>\n",
       "      <td>15.2458</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>159</td>\n",
       "      <td>160</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Sage, Master. Thomas Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "      <td>CA. 2343</td>\n",
       "      <td>69.5500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>176</td>\n",
       "      <td>177</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lefebre, Master. Henry Forbes</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>4133</td>\n",
       "      <td>25.4667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>709</td>\n",
       "      <td>710</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Moubarek, Master. Halim Gonios (\"William George\")</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2661</td>\n",
       "      <td>15.2458</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass  \\\n",
       "65            66         1       3   \n",
       "159          160         0       3   \n",
       "176          177         0       3   \n",
       "709          710         1       3   \n",
       "\n",
       "                                                  Name   Sex  Age  SibSp  \\\n",
       "65                            Moubarek, Master. Gerios  male  NaN      1   \n",
       "159                         Sage, Master. Thomas Henry  male  NaN      8   \n",
       "176                      Lefebre, Master. Henry Forbes  male  NaN      3   \n",
       "709  Moubarek, Master. Halim Gonios (\"William George\")  male  NaN      1   \n",
       "\n",
       "     Parch    Ticket     Fare Cabin Embarked  \n",
       "65       1      2661  15.2458   NaN        C  \n",
       "159      2  CA. 2343  69.5500   NaN        S  \n",
       "176      1      4133  25.4667   NaN        S  \n",
       "709      1      2661  15.2458   NaN        C  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[data_s[\"Name\"].str.contains(r\"Master\")][data_s[\"Age\"].isnull()==True]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s.loc[(65,159,176,709),\"Age\"]=\"15\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'15'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s.loc[65,\"Age\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24.00    30\n",
       "22.00    27\n",
       "18.00    26\n",
       "28.00    25\n",
       "19.00    25\n",
       "         ..\n",
       "70.50     1\n",
       "0.42      1\n",
       "66.00     1\n",
       "53.00     1\n",
       "34.50     1\n",
       "Name: Age, Length: 89, dtype: int64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Age\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 173 entries, 5 to 888\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    173 non-null int64\n",
      "Survived       173 non-null int64\n",
      "Pclass         173 non-null int64\n",
      "Name           173 non-null object\n",
      "Sex            173 non-null object\n",
      "Age            0 non-null object\n",
      "SibSp          173 non-null int64\n",
      "Parch          173 non-null int64\n",
      "Ticket         173 non-null object\n",
      "Fare           173 non-null float64\n",
      "Cabin          19 non-null object\n",
      "Embarked       173 non-null object\n",
      "dtypes: float64(1), int64(5), object(6)\n",
      "memory usage: 17.6+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s[data_s[\"Age\"].isnull()].info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s_Age=data_s.dropna(subset=[\"Age\"],axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "data_s_Age[\"Age\"]=data_s_Age[\"Age\"].astype(\"float\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 718 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      "PassengerId    718 non-null int64\n",
      "Survived       718 non-null int64\n",
      "Pclass         718 non-null int64\n",
      "Name           718 non-null object\n",
      "Sex            718 non-null object\n",
      "Age            718 non-null float64\n",
      "SibSp          718 non-null int64\n",
      "Parch          718 non-null int64\n",
      "Ticket         718 non-null object\n",
      "Fare           718 non-null float64\n",
      "Cabin          185 non-null object\n",
      "Embarked       718 non-null object\n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 72.9+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s_Age.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "data_s_Age[\"Age\"]=pd.cut(data_s_Age[\"Age\"],bins=[0,16,60,np.inf],labels=[\"youth\",\"adult\",\"the_old\"]) #pd.cut是按区间分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "adult      592\n",
       "youth      104\n",
       "the_old     22\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_Age[\"Age\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "youth:13.5%\n",
      "adult:82.5%\n",
      "the old:3.1%\n"
     ]
    }
   ],
   "source": [
    "print('youth:{:.1f}%'.format(100*97/len(data_s_Age['Age'])))\n",
    "print('adult:{:.1f}%'.format(100*592/len(data_s_Age['Age'])))\n",
    "print('the old:{:.1f}%'.format(100*22/len(data_s_Age['Age'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python3.7\\lib\\site-packages\\pandas\\core\\indexing.py:205: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._setitem_with_indexer(indexer, value)\n"
     ]
    }
   ],
   "source": [
    "data_s_null=np.where(pd.isna(data_s['Age']))[0] #isna==isnull\n",
    "data_s['Age'].loc[data_s_null[0:int(len(data_s_null)*0.135)]]=15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s['Age'].loc[data_s_null[int(len(data_s_null)*0.135):int(0.967*len(data_s_null))]]=40"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s['Age'].loc[data_s_null[int(0.967*len(data_s_null)):]]=65"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s['Age']=data_s['Age'].astype('float')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s['Age']=pd.cut(data_s['Age'],bins=[0,16,60,np.inf],labels=[0,1,2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male   1      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   1      1   \n",
       "2                             Heikkinen, Miss. Laina  female   1      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   1      1   \n",
       "4                           Allen, Mr. William Henry    male   1      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#添加一个新的特征Family\n",
    "- 如果SibSp+Parch=0；则为Alone：记为0\n",
    "- 否则记为Family：记为1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      1\n",
       "1      1\n",
       "2      0\n",
       "3      1\n",
       "4      0\n",
       "      ..\n",
       "886    0\n",
       "887    0\n",
       "888    3\n",
       "889    0\n",
       "890    0\n",
       "Name: Family, Length: 891, dtype: int64"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Family\"]=data_s[\"SibSp\"]+data_s[\"Parch\"]\n",
    "data_s['Family']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data_s[\"Family\"][data_s[\"Family\"]>=1]=1\n",
    "data_s['Family'][data_s['Family']==0]=0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      1\n",
       "1      1\n",
       "2      0\n",
       "3      1\n",
       "4      0\n",
       "      ..\n",
       "886    0\n",
       "887    0\n",
       "888    1\n",
       "889    0\n",
       "890    0\n",
       "Name: Family, Length: 891, dtype: int64"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Family\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "e:\\python3.7\\lib\\site-packages\\ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "data_s['Sex'][data_s[\"Sex\"]==\"male\"]=1\n",
    "data_s['Sex'][data_s[\"Sex\"]==\"female\"]=0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    736\n",
       "0    127\n",
       "2     28\n",
       "Name: Age, dtype: int64"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s[\"Age\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    577\n",
       "0    314\n",
       "Name: Sex, dtype: int64"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s['Sex'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 13 columns):\n",
      "PassengerId    891 non-null int64\n",
      "Survived       891 non-null int64\n",
      "Pclass         891 non-null int64\n",
      "Name           891 non-null object\n",
      "Sex            891 non-null object\n",
      "Age            891 non-null category\n",
      "SibSp          891 non-null int64\n",
      "Parch          891 non-null int64\n",
      "Ticket         891 non-null object\n",
      "Fare           891 non-null float64\n",
      "Cabin          204 non-null object\n",
      "Embarked       891 non-null object\n",
      "Family         891 non-null int64\n",
      "dtypes: category(1), float64(1), int64(6), object(5)\n",
      "memory usage: 84.6+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s[\"Sex\"]=data_s[\"Sex\"].astype(\"int\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s[\"Age\"]=data_s[\"Age\"].astype(\"int\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 13 columns):\n",
      "PassengerId    891 non-null int64\n",
      "Survived       891 non-null int64\n",
      "Pclass         891 non-null int64\n",
      "Name           891 non-null object\n",
      "Sex            891 non-null int32\n",
      "Age            891 non-null int32\n",
      "SibSp          891 non-null int64\n",
      "Parch          891 non-null int64\n",
      "Ticket         891 non-null object\n",
      "Fare           891 non-null float64\n",
      "Cabin          204 non-null object\n",
      "Embarked       891 non-null object\n",
      "Family         891 non-null int64\n",
      "dtypes: float64(1), int32(2), int64(6), object(4)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data_s.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Family</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>PassengerId</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.005007</td>\n",
       "      <td>-0.035144</td>\n",
       "      <td>0.042939</td>\n",
       "      <td>0.147051</td>\n",
       "      <td>-0.057527</td>\n",
       "      <td>-0.001652</td>\n",
       "      <td>0.012658</td>\n",
       "      <td>-0.057462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Survived</td>\n",
       "      <td>-0.005007</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.338481</td>\n",
       "      <td>-0.543351</td>\n",
       "      <td>-0.132034</td>\n",
       "      <td>-0.035322</td>\n",
       "      <td>0.081629</td>\n",
       "      <td>0.257307</td>\n",
       "      <td>0.203367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Pclass</td>\n",
       "      <td>-0.035144</td>\n",
       "      <td>-0.338481</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.131900</td>\n",
       "      <td>-0.185242</td>\n",
       "      <td>0.083081</td>\n",
       "      <td>0.018443</td>\n",
       "      <td>-0.549500</td>\n",
       "      <td>-0.135207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Sex</td>\n",
       "      <td>0.042939</td>\n",
       "      <td>-0.543351</td>\n",
       "      <td>0.131900</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.082464</td>\n",
       "      <td>-0.114631</td>\n",
       "      <td>-0.245489</td>\n",
       "      <td>-0.182333</td>\n",
       "      <td>-0.303646</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Age</td>\n",
       "      <td>0.147051</td>\n",
       "      <td>-0.132034</td>\n",
       "      <td>-0.185242</td>\n",
       "      <td>0.082464</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-0.276669</td>\n",
       "      <td>-0.246816</td>\n",
       "      <td>0.035297</td>\n",
       "      <td>-0.254831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>SibSp</td>\n",
       "      <td>-0.057527</td>\n",
       "      <td>-0.035322</td>\n",
       "      <td>0.083081</td>\n",
       "      <td>-0.114631</td>\n",
       "      <td>-0.276669</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.414838</td>\n",
       "      <td>0.159651</td>\n",
       "      <td>0.584471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Parch</td>\n",
       "      <td>-0.001652</td>\n",
       "      <td>0.081629</td>\n",
       "      <td>0.018443</td>\n",
       "      <td>-0.245489</td>\n",
       "      <td>-0.246816</td>\n",
       "      <td>0.414838</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.216225</td>\n",
       "      <td>0.583398</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Fare</td>\n",
       "      <td>0.012658</td>\n",
       "      <td>0.257307</td>\n",
       "      <td>-0.549500</td>\n",
       "      <td>-0.182333</td>\n",
       "      <td>0.035297</td>\n",
       "      <td>0.159651</td>\n",
       "      <td>0.216225</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.271832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Family</td>\n",
       "      <td>-0.057462</td>\n",
       "      <td>0.203367</td>\n",
       "      <td>-0.135207</td>\n",
       "      <td>-0.303646</td>\n",
       "      <td>-0.254831</td>\n",
       "      <td>0.584471</td>\n",
       "      <td>0.583398</td>\n",
       "      <td>0.271832</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             PassengerId  Survived    Pclass       Sex       Age     SibSp  \\\n",
       "PassengerId     1.000000 -0.005007 -0.035144  0.042939  0.147051 -0.057527   \n",
       "Survived       -0.005007  1.000000 -0.338481 -0.543351 -0.132034 -0.035322   \n",
       "Pclass         -0.035144 -0.338481  1.000000  0.131900 -0.185242  0.083081   \n",
       "Sex             0.042939 -0.543351  0.131900  1.000000  0.082464 -0.114631   \n",
       "Age             0.147051 -0.132034 -0.185242  0.082464  1.000000 -0.276669   \n",
       "SibSp          -0.057527 -0.035322  0.083081 -0.114631 -0.276669  1.000000   \n",
       "Parch          -0.001652  0.081629  0.018443 -0.245489 -0.246816  0.414838   \n",
       "Fare            0.012658  0.257307 -0.549500 -0.182333  0.035297  0.159651   \n",
       "Family         -0.057462  0.203367 -0.135207 -0.303646 -0.254831  0.584471   \n",
       "\n",
       "                Parch      Fare    Family  \n",
       "PassengerId -0.001652  0.012658 -0.057462  \n",
       "Survived     0.081629  0.257307  0.203367  \n",
       "Pclass       0.018443 -0.549500 -0.135207  \n",
       "Sex         -0.245489 -0.182333 -0.303646  \n",
       "Age         -0.246816  0.035297 -0.254831  \n",
       "SibSp        0.414838  0.159651  0.584471  \n",
       "Parch        1.000000  0.216225  0.583398  \n",
       "Fare         0.216225  1.000000  0.271832  \n",
       "Family       0.583398  0.271832  1.000000  "
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_metric=data_s.corr()#计算两列值的相关系数\n",
    "data_s_metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived       1.000000\n",
       "Fare           0.257307\n",
       "Family         0.203367\n",
       "Parch          0.081629\n",
       "PassengerId   -0.005007\n",
       "SibSp         -0.035322\n",
       "Age           -0.132034\n",
       "Pclass        -0.338481\n",
       "Sex           -0.543351\n",
       "Name: Survived, dtype: float64"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_metric['Survived'].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_s_new=data_s.drop(columns=['PassengerId','Name','Ticket','Cabin','Embarked'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Family</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass  Sex  Age  SibSp  Parch     Fare  Family\n",
       "0         0       3    1    1      1      0   7.2500       1\n",
       "1         1       1    0    1      1      0  71.2833       1\n",
       "2         1       3    0    1      0      0   7.9250       0\n",
       "3         1       1    0    1      1      0  53.1000       1\n",
       "4         0       3    1    1      0      0   8.0500       0"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_new.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      False\n",
       "1       True\n",
       "2       True\n",
       "3       True\n",
       "4      False\n",
       "       ...  \n",
       "886    False\n",
       "887     True\n",
       "888    False\n",
       "889     True\n",
       "890    False\n",
       "Name: Survived, Length: 891, dtype: bool"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=data_s_new.drop('Survived',axis=1)\n",
    "target=(data_s_new['Survived']==1)\n",
    "target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kn_clf=KNeighborsClassifier()\n",
    "kn_clf.fit(data,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pclass     1.0000\n",
       "Sex        0.0000\n",
       "Age        1.0000\n",
       "SibSp      1.0000\n",
       "Parch      0.0000\n",
       "Fare      71.2833\n",
       "Family     1.0000\n",
       "Name: 1, dtype: float64"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_s_sample=data.iloc[1,:]# 取一行数据\n",
    "data_s_sample\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_s_sample=target.iloc[1]#取label第一行\n",
    "y_s_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ True])"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kn_clf.predict([data_s_sample])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_score,recall_score\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.model_selection import cross_val_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.72390572, 0.77104377, 0.77104377])"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_val_score(kn_clf,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred=cross_val_predict(kn_clf,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precision:68.3%\n",
      "recall:67.5%\n"
     ]
    }
   ],
   "source": [
    "precision=precision_score(target,y_pred)\n",
    "recall=recall_score(target,y_pred)\n",
    "print('precision:{:.1f}%'.format(precision*100))\n",
    "print('recall:{:.1f}%'.format(recall*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "param_grid=[\n",
    "    {'weights':['uniform'],'n_neighbors':[i for i in range(1,11)]},\n",
    "    {'weights':['distance'],'n_neighbors':[i for i in range(1,11)]},\n",
    "]\n",
    "final_kn_clf=GridSearchCV(kn_clf,param_grid,cv=3,\n",
    "                         scoring='accuracy')\n",
    "final_kn_clf.fit(data,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_kn_clf.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kn_clf_new=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
    "                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,\n",
    "                     weights='distance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kn_clf_new.fit(data,target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cross_val_score(kn_clf_new,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_new=cross_val_predict(kn_clf_new,data,target,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "precision=precision_score(target,y_pred_new)\n",
    "recall=recall_score(target,y_pred_new)\n",
    "print('precision:{:.1f}%'.format(precision*100))\n",
    "print('recall:{:.1f}%'.format(recall*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test=pd.read_csv('test.csv')\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_test=pd.read_csv('gender_submission.csv')\n",
    "target_test=target_test['Survived']\n",
    "target_test=(target_test==1)\n",
    "target_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对年龄进行处理\n",
    "test['Sex'][test['Sex']=='male']=1\n",
    "test['Sex'][test['Sex']=='female']=0\n",
    "test['Sex']=test['Sex'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对测试集合进行组合特征\n",
    "test['Family']=test['Parch']+test['SibSp']\n",
    "test['Family'][test['Family']==0]=0\n",
    "test['Family'][test['Family']>0]=1\n",
    "test['Family']=test['Family'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#补充Age空值\n",
    "test_null=np.where(pd.isna(test['Age']))[0]\n",
    "test['Age'].loc[test_null[0:int(len(test_null)*0.135)]]=15\n",
    "test['Age'].loc[test_null[int(len(test_null)*0.135):int(0.967*len(test_null))]]=40\n",
    "test['Age'].loc[test_null[int(0.967*len(test_null)):]]=65\n",
    "test['Age']=test['Age'].astype('float')\n",
    "test['Age']=pd.cut(test['Age'],bins=[0,16,60,np.inf],labels=[0,1,2])\n",
    "test['Age'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 删掉无用属性\n",
    "test_new=test.drop(columns=['PassengerId','Name','Ticket','Cabin','Embarked'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_new.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Fare空值使用平均值来补充\n",
    "test_new['Fare'].iloc[np.where(pd.isna(test_new['Fare']))]=test_new['Fare'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_new.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_new['Age']=test_new['Age'].astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对测试集合进行交叉验中\n",
    "test_pred=cross_val_predict(kn_clf_new,test_new,target_test,cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "precision=precision_score(target_test,test_pred)\n",
    "recall=recall_score(target_test,test_pred)\n",
    "print('precision:{:.1f}%'.format(precision*100))\n",
    "print('recall:{:.1f}%'.format(recall*100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#### 结论\n",
    "- K近邻没有选择最佳参数表现：\n",
    "    - precision:68.4%\n",
    "    - recall:67.8%\n",
    "- 选择最佳参数后模型表现\n",
    "    - precision:72.4%\n",
    "    - recall:68.1%\n",
    "- 测试集表现\n",
    "    - precision:73.9%\n",
    "    - recall:76.3%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
